# Start from an empty global environment so objects left over from an
# interactive session cannot leak into the results.
# NOTE(review): this also wipes the workspace of anyone who source()s this
# script; running it in a fresh R session would make the line unnecessary.
rm(list = ls())

# All data paths in this script are relative to the folder that contains
# JOP_Replication_Materials/. That folder defaults to the original hard-coded
# location and can be overridden on another machine by setting the
# JOP_REPLICATION_ROOT environment variable, without editing this file.
setwd(Sys.getenv("JOP_REPLICATION_ROOT", unset = "/Users/John/Dropbox"))

# --- Load Required Packages ---
library(readxl)
library(readr)
library(stringr)
library(dplyr)

# --- SOE Output Data ---
# Built one step per statement: load nbs_soe.xlsx, left-pad the csic2 code
# with zeros to two characters, then keep only the join keys (year, csic2)
# and the gross output column, renamed to `soe_output`.
soe <- read_excel("JOP_Replication_Materials/data/raw/nbs_soe.xlsx")
soe <- mutate(soe, csic2 = str_pad(csic2, 2, pad = "0"))
soe <- dplyr::select(
  soe,
  year,
  csic2,
  soe_output = `Gross Industrial Output Value`
)

# --- National Total Output ---
# Same shape as the SOE table: year, zero-padded two-character csic2 code,
# and the gross output column from nbs_total.xlsx renamed to `nat_output`.
total <- dplyr::select(
  mutate(
    read_excel("JOP_Replication_Materials/data/raw/nbs_total.xlsx"),
    csic2 = str_pad(csic2, width = 2, pad = "0")
  ),
  year,
  csic2,
  nat_output = `Gross Industrial Output Value`
)

# --- Merge SOE and Total Output ---
# `total` is the left table, so every (year, csic2) row of the national data
# is kept; rows without a match in `soe` get NA for soe_output.
joined <- total %>%
  left_join(soe, by = c("year", "csic2")) %>%
  # Keep only the first row for each (year, csic2) pair.
  distinct(year, csic2, .keep_all = TRUE) %>%
  # Share of national output accounted for by the SOE figure.
  mutate(soe_share = soe_output / nat_output) %>%
  arrange(csic2)

# --- Load CSIC-ISIC Correspondence Table ---
isic <- read_csv(
  "JOP_Replication_Materials/data/raw/cic_isic_correspondence_table.csv"
)

# Left-pad both classification codes with zeros to four characters.
isic <- mutate(
  isic,
  isic = str_pad(isic, 4, pad = "0"),
  cic = str_pad(cic, 4, pad = "0")
)

# The first two characters of the padded cic code form the csic2 key that the
# output tables are joined on.
isic <- mutate(isic, csic2 = substr(cic, 1, 2))

# --- Merge Output with ISIC ---
# The join is many-to-many on csic2: each output row is repeated for every
# correspondence row sharing its csic2 code, and each code appears in several
# output rows.
joined2 <- joined %>%
  left_join(isic, by = "csic2", relationship = "many-to-many") %>%
  # Median soe_share over all joined rows that share an isic code (NAs are
  # ignored); the per-verb `.by` leaves the result ungrouped.
  mutate(
    median_soeshare_isic = median(soe_share, na.rm = TRUE),
    .by = isic
  ) %>%
  # Keep the first row for each (year, isic) pair. The remaining columns
  # (csic2, cic, soe_share, ...) are whatever that first row carried.
  distinct(year, isic, .keep_all = TRUE)

# --- Write Output ---
# write_csv() does not create missing folders, so make sure the output
# directory exists first (e.g. on a fresh checkout where data/processed has
# not been created yet). This is a no-op when the directory is already there.
dir.create(
  "JOP_Replication_Materials/data/processed",
  recursive = TRUE,
  showWarnings = FALSE
)
write_csv(joined2, "JOP_Replication_Materials/data/processed/soe_share.csv")
